Exporatory Data Analysis of NEFSC Bottom Trawl Survey Data

Exploration of spatial and temporal patterns in abundance, and bodymass of fishes from the Northeast groundfish survey. Build code containing data wrangling and conversions can be accessed here.

# Do some formatting
weights_20 <- weights_20 %>% 
  mutate(
    id = as.character(id),
    season = ifelse(season %in% c("SPRING", "Spring"), "Spring", "Fall"),
    season = factor(season, levels = c("Spring", "Fall"))
  )

# Run Summary Functions
ann_means <- ss_annual_summary(weights_20, include_epu = F)
seasonals <- ss_seasonal_summary(weights_20, include_epu = F) 

# bind them so you can facet
summs <- bind_rows(ann_means, seasonals) %>% 
  mutate(season = factor(season, levels = c("Spring", "Fall", "Spring + Fall")))


#drop haddock to see if that changes the bump
haddock_ann  <- ss_annual_summary(filter(weights_20, comname != "haddock"))
haddock_seas <- ss_seasonal_summary(filter(weights_20, comname != "haddock"))
no_haddock <- bind_rows(haddock_ann, haddock_seas) %>% 
  mutate(season = factor(season, levels = c("Spring", "Fall", "Spring + Fall")))

Spatial Patterns

For large regions like Georges Bank and the Gulf of Maine, what kind of patterns are we seeing.

# Load the strata
survey_strata <- read_sf(str_c(res_path, "Shapefiles/BottomTrawlStrata/BTS_Strata.shp"))  %>% 
  clean_names() %>% 
  filter(strata >= 01010 ,
         strata <= 01760,
         strata != 1310,
         strata != 1320,
         strata != 1330,
         strata != 1350,
         strata != 1410,
         strata != 1420,
         strata != 1490) 


# Key to which strata = which regions
strata_key <- list(
  "Georges Bank"          = as.character(13:23),
  "Gulf of Maine"         = as.character(24:40),
  "Southern New England"  = str_pad(as.character(1:12), width = 2, pad = "0", side = "left"),
  "Mid-Atlantic Bight"    = as.character(61:76))


# Assign Areas
survey_strata <- survey_strata %>% 
  mutate(
    strata = str_pad(strata, width = 5, pad = "0", side = "left"),
    strata_num = str_sub(strata, 3, 4),
    area = case_when(
      strata_num %in% strata_key$`Georges Bank` ~ "Georges Bank",
      strata_num %in% strata_key$`Gulf of Maine` ~ "Gulf of Maine",
      strata_num %in% strata_key$`Southern New England` ~ "Southern New England",
      strata_num %in% strata_key$`Mid-Atlantic Bight` ~ "Mid-Atlantic Bight",
    TRUE ~ "Outside Major Study Areas"
  )) %>% 
  select(finstr_id, strata, strata_num, area, a2, str2, set, stratuma, str3, geometry)


# Make trawl data an sf dataset
trawl_sf <- weights_20 %>% st_as_sf(coords = c("decdeg_beglon", "decdeg_beglat"), crs = 4326)

Trawl Regions

# Plot to check
ggplot() +
  geom_sf(data = new_england) +
  geom_sf(data = canada) +
  geom_sf(data = survey_strata, aes(fill = area)) +
  coord_sf(xlim = c(-77, -65.5), ylim = c(34, 45.75), expand = FALSE) +
  guides(fill = guide_legend(nrow = 2)) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title = element_blank())

Ecological Production Units

epu_sf <- ecodata::epu_sf

ggplot() +
  geom_sf(data = new_england) +
  geom_sf(data = canada) +
  geom_sf(data = epu_sf, aes(fill = EPU)) +
  coord_sf(xlim = c(-77, -65.5), ylim = c(34, 45.75), expand = FALSE) +
  guides(fill = guide_legend(nrow = 2)) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title = element_blank())

Regional Summaries

# Just Area, all seasona
area_summs <- weights_20 %>% 
  group_by(survey_area) %>% 
  summarise(
    season = "Spring + Fall",
    lw_biomass_kg = sum(sum_weight_kg, na.rm = T),
    n_stations = n_distinct(id),
    lw_biomass_per_station = lw_biomass_kg / n_stations,
    mean_ind_bodymass = weighted.mean(ind_weight_kg, weights = numlen_adj),
    mean_ind_length = weighted.mean(length, weights = numlen_adj),
    .groups = "keep"
  ) %>% 
  ungroup()

# Area x Season
seas_area <- weights_20 %>% 
  group_by(survey_area, season) %>% 
  summarise(
    lw_biomass_kg = sum(sum_weight_kg, na.rm = T),
    n_stations = n_distinct(id),
    lw_biomass_per_station = lw_biomass_kg / n_stations,
    mean_ind_bodymass = weighted.mean(ind_weight_kg, weights = numlen_adj),
    mean_ind_length = weighted.mean(length, weights = numlen_adj),
    .groups = "keep"
  ) %>% 
  ungroup()


# Combine those two
summs_combined <- bind_rows(area_summs, seas_area) %>% 
  mutate(season = factor(season, levels = c("Spring", "Fall", "Spring + Fall")))

summs_combined %>% 
  mutate_if(is.numeric,round, 2) %>% 
  arrange(survey_area,season) %>% 
  knitr::kable()
survey_area season lw_biomass_kg n_stations lw_biomass_per_station mean_ind_bodymass mean_ind_length
GB Spring 328600.2 2531 129.83 1.08 43.14
GB Fall 558771.2 2259 247.35 0.89 40.91
GB Spring + Fall 887371.4 4790 185.25 0.99 42.08
GoM Spring 268460.9 2963 90.60 0.80 38.27
GoM Fall 585521.9 2914 200.93 0.87 39.96
GoM Spring + Fall 853982.8 5877 145.31 0.83 39.14
MAB Spring 462469.6 1679 275.44 1.21 56.22
MAB Fall 136608.2 1520 89.87 1.64 28.63
MAB Spring + Fall 599077.8 3199 187.27 1.31 49.56
SNE Spring 290320.8 2180 133.17 0.75 41.81
SNE Fall 186327.3 2010 92.70 0.72 38.46
SNE Spring + Fall 476648.0 4190 113.76 0.74 40.65
# Year x Area
area_summs_y <- weights_20 %>% 
  group_by(est_year, survey_area) %>% 
  summarise(
    season                 = "Spring + Fall",
    lw_biomass_kg          = sum(sum_weight_kg, na.rm = T),
    n_stations             = n_distinct(id),
    lw_biomass_per_station = lw_biomass_kg / n_stations,
    mean_ind_bodymass      = weighted.mean(ind_weight_kg, weights = numlen_adj),
    mean_ind_length        = weighted.mean(length, weights = numlen_adj),
    expanded_abund         = sum(expanded_abund_s),
    expanded_lwbio         = sum(expanded_lwbio_s),
    expanded_biom          = sum(expanded_biom_s),
    .groups = "keep"
  ) %>% 
  ungroup()


# weights_20 %>% 
#   ggplot(aes(est_year, sum_weight_kg)) +
#   geom_boxplot(aes(group = est_year))

Total Biomass

Total biomass in the figures below is derived from weight at length relationships and reflects the mean expected biomass of the fish caught using the observed distribution and frequency of lengths.

area_summs_y %>% 
  ggplot(aes(est_year, lw_biomass_kg)) +
    geom_line() +
    facet_wrap(~survey_area, ncol = 2) +
    scale_y_continuous(labels = scales::comma_format()) +
    labs(x = "", y = "Total Biomass (kg)")

CPUE

CPUE displayed below is the mean total biomass, derived from weight at length relationships, per station for each region. These values have not been weighted to reflect the difference in area between regions.

area_summs_y %>% 
  ggplot(aes(est_year, lw_biomass_per_station)) +
    geom_line() +
    facet_wrap(~survey_area, ncol = 2) +
    labs(x = "", y = "Adjusted Biomass per Station (kg)")

Area Expanded Abundance

Area expanded abundance is calculated by taking the catch per station rates for each length class of each species and extending that density to the total areas of the region. Catchability is assumed to be 1 for this calculation though it is certainly lower. This extension of catch rates is highly sensitive to large swings due to high variances and zero inflation of abundances.

area_summs_y %>% 
  ggplot(aes(est_year, expanded_abund/1000000)) +
    geom_line() +
    scale_y_continuous(labels = scales::comma_format()) +
    facet_wrap(~survey_area, ncol = 2) +
    labs(x = "", y = "Area Expanded Abundance (millions)")

Area Expanded Biomass

The expected biomass estimates for the survdat$biomass data and the expected biomass from L-W regressions differ wildly once the survey transitions over to the Henry Bigelow. Not sure at this point if that is because the coefficients are off, or if there are some specific stations that need to be investigated, but its quite a large difference:

fscs <- area_summs_y %>% 
  ggplot() +
    geom_line(aes(est_year, expanded_biom  /1000000, color = "Shipboard Weights")) +
    scale_y_continuous(labels = scales::comma_format()) +
    facet_wrap(~survey_area, ncol = 1) +
    scale_color_gmri(reverse = T) +
    labs(x = "", y = "Area Expanded Biomass FSCS (million kg)") +
    theme(legend.position = "bottom", legend.title = element_blank())

lw <- area_summs_y %>% 
  ggplot() +
    geom_line(aes(est_year, expanded_lwbio /1000000, color = "LW Regression Weights")) +
    scale_y_continuous(labels = scales::comma_format()) +
    scale_color_gmri(reverse = F) +
    facet_wrap(~survey_area, ncol = 1) +
    labs(x = "", y = "Area Expanded Biomass LW (million kg)") +
    theme(legend.position = "bottom", legend.title = element_blank())

fscs | lw

Comparisons to Older Data

Concerns have been raised that the datasets obtained through the NEFSC are inconsistent in some areas over time. The following plots seek to identify differences in a dataset obtained in 2016 from what we currently are exploring with the 2020 dataset.

Each file was processed for size-spectra analysis using the same processing steps. This includes the same species codes, the same abundance and stratification adjustments, and the same L-W derived biomasses.

NOTE: The resulting analysis has been removed to shorten the length of this markdown.

# weights_16 <- read_csv(here::here("data/ss_prepped_data/survdat_2016_ss.csv"),
#                        col_types = cols(),
#                        guess_max = 1e5)
# weights_19 <- read_csv(here::here("data/ss_prepped_data/survdat_2019_ss.csv"),
#                        col_types = cols(),
#                        guess_max = 1e5)
# weights_20 <- read_csv(here::here("data/ss_prepped_data/survdat_2020_ss.csv"),
#                        col_types = cols(),
#                        guess_max = 1e5)
# 
# # run summaries
# summ_16 <- ss_regional_differences(weights_16) %>% mutate(source = "2016")
# summ_19 <- ss_regional_differences(weights_19) %>% mutate(source = "2019")
# summ_20 <- ss_regional_differences(weights_20) %>% mutate(source = "2020")
# reg_summs <- bind_rows(list(summ_16, summ_19, summ_20))
# # Total Biomass
# p1 <- reg_summs %>% 
#   ggplot(aes(est_year, lw_biomass_kg, color = source)) +
#   geom_line(show.legend = F) +
#   scale_y_continuous(labels = scales::comma_format()) +
#   facet_wrap(~survey_area, ncol = 1, scales = "free") +
#   labs(x = "", y = "Total Biomass \n (L-W Regressions)")
# 
# # Total Biomass - FSCS
# p2 <- reg_summs %>% 
#   ggplot(aes(est_year, fscs_biomass_kg, color = source)) +
#   geom_line() +
#   scale_y_continuous(labels = scales::comma_format()) +
#   facet_wrap(~survey_area, ncol = 1) +
#   labs(x = "", y = "Total Biomass \n (FSCS Haul Weights)")
# 
# # effort
# p3 <- reg_summs %>% 
#   ggplot(aes(est_year, n_stations, color = source)) +
#   geom_line(show.legend = F) +
#   scale_y_continuous(labels = scales::comma_format()) +
#   facet_wrap(~survey_area, ncol = 1) +
#   labs(x = "", y = "Effort (haul count)")
# 
# # Species 
# p4 <- reg_summs %>% 
#   ggplot(aes(est_year, n_species, color = source)) +
#   geom_line(show.legend = F) +
#   scale_y_continuous(labels = scales::comma_format()) +
#   facet_wrap(~survey_area, ncol = 1) +
#   labs(x = "", y = "Distinct Species")
# 
# p1 + p2 + p3 + p4
 

A work by Adam A. Kemberling

Akemberling@gmri.org